library(dplyr)
library(ggplot2)
library(cleandata)
library(corrplot)
library(gridExtra)
library(Metrics)
library(caret)
library(MASS)
library(robustbase)
library(cvTools)
library(sp)
library(rgdal)
library(geosphere)
library(dismo)
library(rgeos)
library(RANN)
# Divisor applied to median_house_value during preprocessing; the error
# helpers multiply by it again to report errors on the original scale.
median_house_value_scale <- 100000
# Prepare a raw housing data frame for modelling.
#
# df: data frame with an `ocean_proximity` column and, optionally,
#     `median_house_value`.
# Returns `df` with `ocean_proximity` converted to a factor, an added
# `enc_ocean_proximity` column produced by encode_ordinal(), and
# `median_house_value` (when present) divided by median_house_value_scale.
initialPreprocessing <- function(df) {
  df$ocean_proximity <- as.factor(df$ocean_proximity)

  # Level order handed to encode_ordinal().
  proximity_order <- c("INLAND", "<1H OCEAN", "NEAR OCEAN", "NEAR BAY", "ISLAND")
  encoded <- encode_ordinal(
    data.frame(enc_ocean_proximity = df[["ocean_proximity"]]),
    order = proximity_order,
    out.int = TRUE,
    full_print = FALSE
  )
  df <- cbind(df, enc_ocean_proximity = encoded)

  # Data sets without the target column are left unscaled.
  has_target <- "median_house_value" %in% colnames(df)
  if (has_target) {
    df$median_house_value <- df$median_house_value / median_house_value_scale
  }
  df
}
# Read the CSV file `csvName` and run it through initialPreprocessing().
loadAndPreprocess <- function(csvName) {
  raw <- read.csv(csvName)
  initialPreprocessing(raw)
}
# Load and preprocess the full labelled data set.
all_data <- loadAndPreprocess("train.csv")
coded 1 cols 5 levels
# Display the preprocessed data frame.
all_data
Es bastante claro que al menos en cierta medida, la proximidad al océano afecta el precio. Esto nos da una pista de que podemos codificar esta variable como ordinal. Movimos esto a nuestra función de pre-procesamiento arriba para aprovecharlo en cualquier dataset.
# Mean (scaled) house value per ocean_proximity level, highest first.
all_data %>%
  group_by(ocean_proximity) %>%
  summarise(mean_value = mean(median_house_value)) %>%
  arrange(desc(mean_value))
Usamos summary para ver cuáles columnas tienen NAs y verificamos el valor mínimo de cada columna para asegurarnos de que no haya NAs disfrazados de 0.
# Per-column summary of all_data (includes NA counts and minimum values).
summary(all_data)
Solamente total_bedrooms tiene faltantes y son 144, veamos cuánto es eso en porcentaje.
# Number of missing total_bedrooms values, then the same as a percentage
# of all rows.
naCount <- sum(is.na(all_data$total_bedrooms))
naCount / length(all_data$total_bedrooms) * 100
Prácticamente 1% de datos faltantes. Nos ocuparemos de ellos, pero antes debemos dividir los datos en train/test.
# Reproducible random split of all_data into the partitions named in `spec`.
set.seed(279720)
spec <- c(train = .80, validate = .20)
# spec <- c(train = .7, test = .15, validate = .15)

# cut() labels consecutive row positions according to the cumulative
# proportions in `spec`; sample() then shuffles those labels across rows.
g <- sample(
  cut(
    seq(nrow(all_data)),
    nrow(all_data) * cumsum(c(0, spec)),
    labels = names(spec)
  )
)
data <- split(all_data, g)
# Training rows with a known total_bedrooms, without ocean_proximity and id,
# so every remaining column can be passed to cor().
nasRemoved <- data$train %>%
  dplyr::select(-c(ocean_proximity, id)) %>%
  filter(!is.na(total_bedrooms))

# Correlation matrix of the training columns, with coefficients overlaid.
corrplot(
  cor(nasRemoved),
  method = "ellipse",
  type = "full",
  addCoef.col = rgb(0, 0, 0, alpha = 0.6),
  diag = TRUE,
  number.cex = 0.77,
  col = colorRampPalette(c("red", "white", "green"))(100)
)
Vemos que nuestra variable con NAs (total_bedrooms) tiene correlación casi perfecta con households, por lo que usaremos el valor de esta para obtener datos de imputación.
# Fill missing total_bedrooms values with predictions from a simple linear
# regression of total_bedrooms on households.
#
# df:        data frame with `households` and `total_bedrooms` columns.
# traindata: data frame the regression is fitted on (defaults to nasRemoved).
# Returns `df` with `total_bedrooms` stored as integer; known values are
# kept and missing ones are replaced by the prediction converted with
# as.integer().
imp_total_bedrooms <- function(df, traindata = nasRemoved) {
  bedrooms_fit <- lm(total_bedrooms ~ households, data = traindata)
  predicted <- predict(bedrooms_fit, data.frame(households = df$households))

  bedrooms <- df$total_bedrooms
  is_missing <- is.na(bedrooms)
  bedrooms[is_missing] <- predicted[is_missing]

  df$total_bedrooms <- as.integer(bedrooms)
  df
}
# Impute both splits with the regression fitted on the training rows only.
data[["train"]] <- imp_total_bedrooms(data[["train"]])
data[["validate"]] <- imp_total_bedrooms(data[["validate"]])
print(summary(data$train$total_bedrooms))
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.0 296.0 435.0 534.9 642.0 6445.0
# Same summary for the validation split after imputation.
print(summary(data$validate$total_bedrooms))
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.0 297.0 439.0 542.7 658.8 4492.0
# Draw a histogram, a boxplot and a Q-Q plot (with reference line) of one
# column side by side.
#
# df:      data frame holding the column.
# colname: name of the column to plot, as a string.
# Returns the value of grid.arrange(), which lays the three panels out in
# a single row.
plot_outliers <- function(df, colname) {
  column <- sym(colname)

  hist <- ggplot(df, aes(x = !!column)) +
    geom_histogram(color = "white", fill = "blue") +
    theme_minimal()

  box <- ggplot(df, aes(x = !!column)) +
    geom_boxplot() +
    theme_minimal()

  # The `+` before theme_minimal() matters: without it the theme call is a
  # separate statement whose result is discarded, leaving this panel unthemed.
  qq <- ggplot(df, aes(sample = !!column)) +
    stat_qq() +
    stat_qq_line(col = "red", lwd = 1) +
    theme_minimal()

  grid.arrange(hist, box, qq, ncol = 3)
}
# Call plot_outliers() once for every column of `data`.
#
# When `data` has an `id` column, `ocean_proximity` and `id` are dropped and
# rows with a missing total_bedrooms are removed first; otherwise `data` is
# plotted as given.
plotAllOutliers <- function(data) {
  has_id <- "id" %in% colnames(data)
  to_plot <- if (has_id) {
    filter(
      dplyr::select(data, -c(ocean_proximity, id)),
      !is.na(total_bedrooms)
    )
  } else {
    data
  }

  for (column_name in names(to_plot)) {
    plot_outliers(to_plot, column_name)
  }
}
plotAllOutliers(data$train)

# Candidate ratio features computed on the training split.
beds_per_rooms <- data$train$total_bedrooms / data$train$total_rooms
rooms_per_household <- data$train$total_rooms / data$train$households
income_per_capita <- data$train$median_income / data$train$population
income_per_household <- data$train$median_income / data$train$households
beds_per_capita <- data$train$total_bedrooms / data$train$population
rooms_per_capita <- data$train$total_rooms / data$train$population
# check this one again after doing something with the outliers
pop_per_household <- data$train$population / data$train$households
pop_per_bedroom <- data$train$population / data$train$total_bedrooms
pop_per_room <- data$train$population / data$train$total_rooms

# Candidates side by side with the target, to inspect their correlations.
candidates <- data.frame(
  beds_per_rooms,
  rooms_per_household,
  income_per_capita,
  income_per_household,
  beds_per_capita,
  rooms_per_capita,
  pop_per_household,
  pop_per_bedroom,
  pop_per_room,
  data$train$median_house_value
)
summary(candidates)
corrplot(
  cor(candidates),
  method = "ellipse",
  type = "full",
  addCoef.col = rgb(0, 0, 0, alpha = 0.6),
  diag = TRUE,
  number.cex = 0.7,
  tl.cex = 0.75,
  col = colorRampPalette(c("red", "white", "green"))(100)
)
# Append the derived ratio features used by the models.
#
# df: data frame with `total_bedrooms`, `total_rooms`, `households`,
#     `median_income` and `population` columns.
# Returns `df` with four extra columns, in this order: beds_per_rooms,
# rooms_per_capita, rooms_per_household, income_per_capita. If `df` already
# has a `beds_per_rooms` column it is returned unchanged, so calling the
# function twice does not duplicate columns.
addExtraFeats <- function(df) {
  if ("beds_per_rooms" %in% colnames(df)) {
    return(df)
  }

  # Only the ratios that are actually appended are computed.
  cbind(
    df,
    beds_per_rooms = df$total_bedrooms / df$total_rooms,
    rooms_per_capita = df$total_rooms / df$population,
    rooms_per_household = df$total_rooms / df$households,
    income_per_capita = df$median_income / df$population
  )
}
# Add the derived features to both splits.
data[["train"]] <- addExtraFeats(data[["train"]])
data[["validate"]] <- addExtraFeats(data[["validate"]])
# Error of `model` on a held-out set, rescaled by `scale`.
#
# model:         fitted model accepted by predict().
# observations:  data passed to predict() as new data.
# actual_values: observed target values for `observations`.
# scale:         multiplier applied to the rmse() result (defaults to
#                median_house_value_scale).
# Returns rmse(predictions, actual_values) multiplied by `scale`.
model_rmse <- function(model, observations, actual_values, scale = median_house_value_scale) {
  predicted <- predict(model, observations)
  scaled_error <- rmse(predicted, actual_values) * scale
  scaled_error
}
# Report the hold-out error and the repeated K-fold CV error of a model.
#
# model: fitted model accepted by predict() and repCV().
# xs:    hold-out observations (defaults to data$validate).
# y:     observed target values for `xs`.
# cost:  cost function handed to repCV() (defaults to rmspe).
# ...:   further arguments forwarded to repCV().
# Returns a one-row data frame with `validation_error` (from model_rmse())
# and `k_fold_cv_error` (first repCV() cost, multiplied by
# median_house_value_scale).
scores <- function(model, xs = data$validate, y = data$validate$median_house_value, cost = rmspe, ...) {
  # repCV() cross-validates the data the model was fitted on, so the folds
  # must be sized by the model's own observation count, not by the hold-out
  # set. Fall back to nrow(xs) only for models without a nobs() method.
  n_obs <- tryCatch(stats::nobs(model), error = function(e) nrow(xs))
  folds <- cvFolds(n_obs, K = 5, R = 10)

  data.frame(
    validation_error = model_rmse(model, xs, y),
    k_fold_cv_error = repCV(model, cost = cost, folds = folds, ...)$cv[[1]] * median_house_value_scale
  )
}
# Keep only the target and the predictor columns used for model fitting,
# in the order listed below.
extractTrainingVars <- function(data) {
  dplyr::select(
    data,
    median_house_value,
    median_income,
    enc_ocean_proximity,
    total_rooms,
    latitude,
    longitude,
    total_bedrooms,
    housing_median_age,
    beds_per_rooms,
    rooms_per_capita,
    rooms_per_household,
    population,
    income_per_capita
  )
}
# Baseline: plain linear model on every selected training variable.
training_vars_1 <- extractTrainingVars(data$train)
fit1 <- lm(median_house_value ~ ., data = training_vars_1)
scores(fit1)
NA
# For each column of interest: draw a boxplot of the training values and
# print the points boxplot.stats() reports as outliers. The columns are
# processed in the order listed, so `outlier_values` ends up holding the
# outliers of median_house_value.
for (column in c(
  "total_rooms",
  "total_bedrooms",
  "population",
  "households",
  "median_income",
  "median_house_value"
)) {
  outlier_values <- boxplot.stats(data$train[[column]])$out # outlier values.
  boxplot(data$train[[column]], main = column, boxwex = 0.1)
  print(outlier_values)
}
# Cook's distance of every observation in fit1, plotted with a cutoff line
# at four times the mean distance; points above the cutoff are labelled.
cooksd <- cooks.distance(fit1)
plot(cooksd, pch = "*", cex = 1, main = "Influential Obs by Cooks distance") # plot cook's distance
abline(h = 4 * mean(cooksd, na.rm = TRUE), col = "red") # add cutoff line
text(
  x = seq_along(cooksd) + 1,
  y = cooksd,
  labels = ifelse(cooksd > 4 * mean(cooksd, na.rm = TRUE), names(cooksd), ""),
  col = "red"
) # add labels

# Observations above the cutoff, largest distance first. Their row names are
# converted to integers and later matched against rownames(data$train).
# NOTE(review): this relies on arrange() keeping row names - confirm for the
# installed dplyr version.
distances <- data.frame(d = cooksd[cooksd > 4 * mean(cooksd, na.rm = TRUE)])
distances <- arrange(distances, desc(d))
indices <- as.integer(row.names(distances))
# Drop the rows of `data` whose row names match the first `n` entries of
# `indices`.
#
# n:       how many leading entries of `indices` to remove.
# indices: row labels (compared with rownames(data)), most important first.
# data:    data frame to filter.
# Returns `data` without the matched rows; always a data frame.
removeTopNOutliers <- function(n, indices, data) {
  # head() copes with n == 0 and n > length(indices); indices[1:n] would
  # pick the first entry for n == 0 and pad with NA past the end.
  to_drop <- head(indices, max(n, 0))

  # A logical keep-mask instead of negative positions: an empty or unmatched
  # `to_drop` keeps every row rather than selecting none or failing on NA.
  keep <- !(rownames(data) %in% to_drop)
  data[keep, , drop = FALSE]
}
# Refit without the three most influential rows, and without every row
# above the Cook's distance cutoff, then compare the scores.
trainMinusTopOL <- removeTopNOutliers(3, indices, data$train)
trainMinusAllOL <- removeTopNOutliers(length(indices), indices, data$train)
fitTop3 <- lm(median_house_value ~ ., data = extractTrainingVars(trainMinusTopOL))
fitAll <- lm(median_house_value ~ ., data = extractTrainingVars(trainMinusAllOL))
scores(fitTop3)
scores(fitAll)

# Show the three rows that were dropped for fitTop3.
data$train[match(indices[1:3], rownames(data$train)), ]
summary(fitTop3)
Call:
lm(formula = median_house_value ~ ., data = extractTrainingVars(trainMinusTopOL))
Residuals:
Min 1Q Median 3Q Max
-3.8735 -0.3926 -0.0798 0.2980 4.9730
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.214e+01 1.046e+00 -30.727 < 2e-16 ***
median_income 4.280e-01 4.786e-03 89.444 < 2e-16 ***
enc_ocean_proximity 4.394e-02 9.341e-03 4.704 2.59e-06 ***
total_rooms -2.998e-06 1.222e-05 -0.245 0.80621
latitude -3.857e-01 1.099e-02 -35.089 < 2e-16 ***
longitude -3.789e-01 1.196e-02 -31.678 < 2e-16 ***
total_bedrooms 2.456e-04 6.557e-05 3.746 0.00018 ***
housing_median_age 1.027e-02 5.601e-04 18.330 < 2e-16 ***
beds_per_rooms 2.210e+00 1.723e-01 12.830 < 2e-16 ***
rooms_per_capita 5.535e-01 1.868e-02 29.624 < 2e-16 ***
rooms_per_household -1.840e-01 7.941e-03 -23.168 < 2e-16 ***
population -4.976e-05 1.703e-05 -2.922 0.00349 **
income_per_capita -1.869e+00 3.962e-01 -4.717 2.42e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.6592 on 11541 degrees of freedom
Multiple R-squared: 0.6782, Adjusted R-squared: 0.6779
F-statistic: 2027 on 12 and 11541 DF, p-value: < 2.2e-16
# Coefficient table of the model fitted without the rows above the cutoff.
summary(fitAll)
Call:
lm(formula = median_house_value ~ ., data = extractTrainingVars(trainMinusAllOL))
Residuals:
Min 1Q Median 3Q Max
-2.48273 -0.38226 -0.06987 0.30022 3.09619
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.104e+01 9.980e-01 -31.097 < 2e-16 ***
median_income 4.726e-01 5.029e-03 93.980 < 2e-16 ***
enc_ocean_proximity 2.629e-02 8.812e-03 2.984 0.00285 **
total_rooms -2.486e-05 1.279e-05 -1.943 0.05201 .
latitude -3.654e-01 1.051e-02 -34.753 < 2e-16 ***
longitude -3.598e-01 1.140e-02 -31.555 < 2e-16 ***
total_bedrooms 2.889e-04 6.983e-05 4.138 3.54e-05 ***
housing_median_age 1.219e-02 5.333e-04 22.867 < 2e-16 ***
beds_per_rooms 3.004e+00 1.883e-01 15.952 < 2e-16 ***
rooms_per_capita 6.237e-01 2.023e-02 30.837 < 2e-16 ***
rooms_per_household -1.917e-01 8.342e-03 -22.976 < 2e-16 ***
population -1.959e-05 1.951e-05 -1.004 0.31532
income_per_capita -3.863e+00 8.563e-01 -4.511 6.51e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.6168 on 11418 degrees of freedom
Multiple R-squared: 0.7102, Adjusted R-squared: 0.7099
F-statistic: 2332 on 12 and 11418 DF, p-value: < 2.2e-16
# Alternative outlier selection: row labels taken from the names of the
# first component returned by car::outlierTest(fit1).
outlierIndices <- as.numeric(names(car::outlierTest(fit1)[[1]]))
trainMinusAllOL2 <- removeTopNOutliers(length(outlierIndices), outlierIndices, data$train)
fitAll2 <- lm(median_house_value ~ ., data = extractTrainingVars(trainMinusAllOL2))
scores(fitAll2)
# total_rooms: original values next to capped variants.
candidates <- data.frame(
  original = data_train$total_rooms,
  cap_default = cap(data_train$total_rooms),
  cap_5 = cap(data_train$total_rooms, IQR_factor = 19)
)
plotAllOutliers(candidates)
compare_capped_performance(data_train, "total_rooms", candidates)
# total_bedrooms: original values next to capped variants.
# Every column needs its own name: the last variant used to repeat `cap_5`,
# which data.frame() silently renamed to `cap_5.1`.
candidates <- data.frame(
  original = data_train$total_bedrooms,
  cap_default = cap(data_train$total_bedrooms),
  cap_2 = cap(data_train$total_bedrooms, IQR_factor = 1.5),
  cap_3 = cap(data_train$total_bedrooms, IQR_factor = 2),
  cap_5 = cap(data_train$total_bedrooms, IQR_factor = 7),
  cap_6 = cap(data_train$total_bedrooms, IQR_factor = 16)
)
plotAllOutliers(candidates)
compare_capped_performance(data_train, "total_bedrooms", candidates)
# population: original values next to capped variants.
candidates <- data.frame(
  original = data_train$population,
  cap_default = cap(data_train$population),
  cap_6 = cap(data_train$population, IQR_factor = 18)
)
plotAllOutliers(candidates)
compare_capped_performance(data_train, "population", candidates)

# households: original values next to capped variants.
candidates <- data.frame(
  original = data_train$households,
  cap_default = cap(data_train$households),
  cap_2 = cap(data_train$households, IQR_factor = 1.5),
  cap_6 = cap(data_train$households, IQR_factor = 17.5)
)
plotAllOutliers(candidates)
compare_capped_performance(data_train, "households", candidates)
# median_income: original values next to capped variants.
candidates <- data.frame(
  original = data_train$median_income,
  cap_default = cap(data_train$median_income),
  cap_2 = cap(data_train$median_income, IQR_factor = 1.5),
  cap_3 = cap(data_train$median_income, IQR_factor = 2) # DING! (author's marker on this variant)
)
[1] "caps: 1.59103 7.359005"
[1] "thresholds: -0.721837500000001 8.0524625"
[1] "caps: 1.59103 7.359005"
[1] "thresholds: -0.721837500000001 8.0524625"
[1] "caps: 1.59103 7.359005"
[1] "thresholds: -1.818625 9.14925"
# Outlier plots for the median_income candidates are disabled here.
#plotAllOutliers(candidates)
# Compare the median_income candidates, passing extractTrainingVars as the
# `extract` argument.
compare_capped_performance(data_train, "median_income", candidates, extract = extractTrainingVars)
[1] "original"
[1] "66916.2373841967" "65258.6203328887"
[1] "cap_default"
[1] "68736.2718041317" "65228.7634702003"
[1] "cap_2"
[1] "68736.2718041317" "65239.9032777355"
[1] "cap_3"
[1] "68265.5792341056" "65022.490643416"
# Same comparison without an explicit `extract` argument.
compare_capped_performance(data_train, "median_income", candidates)
[1] "original"
[1] "66899.9244852506" "65236.6103980749"
[1] "cap_default"
[1] "68665.1343300481" "65213.0030072346"
[1] "cap_2"
[1] "68665.1343300481" "65214.9882612058"
[1] "cap_3"
[1] "68195.6474577282" "65005.7533989622"
# Trial copy of the training data with median_income replaced by the
# cap_3 candidate, refitted and scored.
prueba <- data_train
prueba$median_income <- candidates$cap_3
fit_cap <- lm(median_house_value ~ ., data = extractAndRecalculateTrainingVars(prueba))
scores(fit_cap)
# Fit the preProcess() transformation on the training columns (target,
# coordinates and id excluded) and apply it to both splits. The same three
# statements used to be repeated verbatim; one pass is enough.
preProcess1 <- preProcess(subset(data_train, select = -c(median_house_value, latitude, longitude, id)))
train1 <- predict(preProcess1, data_train)
validate1 <- predict(preProcess1, data$validate)
# Polynomial / interaction model on the preprocessed training data.
#
# Each poly(x, k) already contains the linear component of x, so the raw x
# term is not listed alongside it: doing so makes the design matrix
# rank-deficient (one aliased coefficient per variable) and triggers
# "prediction from a rank-deficient fit" warnings. For the same reason
# total_rooms * total_bedrooms is spelled out as total_rooms plus the
# interaction; the total_bedrooms main effect comes from poly(total_bedrooms, 3).
# The terms span the same model as the redundant formulation.
fitpp2 <- lm(
  median_house_value ~
    poly(median_income, 2) +
    latitude * longitude +
    enc_ocean_proximity +
    poly(population, 2) +
    total_rooms + total_rooms:total_bedrooms +
    poly(total_bedrooms, 3) +
    poly(housing_median_age, 3) +
    poly(beds_per_rooms, 3) +
    rooms_per_capita +
    poly(rooms_per_household, 3) +
    income_per_capita,
  data = extractTrainingVars(train1)
)
summary(fitpp2)
Call:
lm(formula = median_house_value ~ median_income + poly(median_income,
2) + latitude * longitude + enc_ocean_proximity + population +
poly(population, 2) + total_rooms * total_bedrooms + total_bedrooms +
poly(total_bedrooms, 3) + housing_median_age + poly(housing_median_age,
3) + beds_per_rooms + poly(beds_per_rooms, 3) + rooms_per_capita +
rooms_per_household + poly(rooms_per_household, 3) + income_per_capita,
data = extractTrainingVars(train1))
Residuals:
Min 1Q Median 3Q Max
-3.5412 -0.3975 -0.0751 0.3065 5.0297
Coefficients: (6 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -96.888328 9.864448 -9.822 < 2e-16 ***
median_income 0.806447 0.011196 72.031 < 2e-16 ***
poly(median_income, 2)1 NA NA NA NA
poly(median_income, 2)2 -9.211385 0.720914 -12.777 < 2e-16 ***
latitude 1.599060 0.286084 5.589 2.33e-08 ***
longitude -0.933070 0.081789 -11.408 < 2e-16 ***
enc_ocean_proximity 0.054074 0.009473 5.708 1.17e-08 ***
population -0.040610 0.021237 -1.912 0.055878 .
poly(population, 2)1 NA NA NA NA
poly(population, 2)2 2.187558 1.205663 1.814 0.069642 .
total_rooms -0.039359 0.044417 -0.886 0.375576
total_bedrooms 0.105353 0.030696 3.432 0.000601 ***
poly(total_bedrooms, 3)1 NA NA NA NA
poly(total_bedrooms, 3)2 -5.820900 3.066821 -1.898 0.057718 .
poly(total_bedrooms, 3)3 1.201049 0.739795 1.623 0.104512
housing_median_age 0.145205 0.007093 20.471 < 2e-16 ***
poly(housing_median_age, 3)1 NA NA NA NA
poly(housing_median_age, 3)2 1.505512 0.672137 2.240 0.025117 *
poly(housing_median_age, 3)3 2.694581 0.667072 4.039 5.39e-05 ***
beds_per_rooms 0.206483 0.017256 11.966 < 2e-16 ***
poly(beds_per_rooms, 3)1 NA NA NA NA
poly(beds_per_rooms, 3)2 -1.696288 0.948529 -1.788 0.073748 .
poly(beds_per_rooms, 3)3 -5.406387 0.745723 -7.250 4.44e-13 ***
rooms_per_capita 0.487469 0.019043 25.599 < 2e-16 ***
rooms_per_household -0.320506 0.017479 -18.336 < 2e-16 ***
poly(rooms_per_household, 3)1 NA NA NA NA
poly(rooms_per_household, 3)2 -8.357469 1.040492 -8.032 1.05e-15 ***
poly(rooms_per_household, 3)3 3.850034 0.854482 4.506 6.68e-06 ***
income_per_capita -0.014765 0.006615 -2.232 0.025622 *
latitude:longitude 0.016318 0.002357 6.923 4.66e-12 ***
total_rooms:total_bedrooms 0.003357 0.007667 0.438 0.661451
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.6486 on 11529 degrees of freedom
Multiple R-squared: 0.6888, Adjusted R-squared: 0.6882
F-statistic: 1063 on 24 and 11529 DF, p-value: < 2.2e-16
# Score fitpp2 against the preprocessed validation split.
scores(fitpp2, xs=validate1, y=validate1$median_house_value)
prediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be 
misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleading
66204.42 64151.6 66085.98 64009.12
65934.67 64135.29
65891.08 64631.93
65805.51 64390.26
65794.3 64320.67
65786.1 64297.81
# Linear model with every main effect plus all two- and three-way
# interactions of the selected training variables.
full_threeway_model <- lm(median_house_value ~ (.)^3, data = extractTrainingVars(train1))
summary(full_threeway_model)
Call:
lm(formula = median_house_value ~ (.)^3, data = extractTrainingVars(train1))
Residuals:
Min 1Q Median 3Q Max
-3.2030 -0.3168 -0.0579 0.2401 4.1918
Coefficients: (37 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.072e+02 2.342e+01 -4.575 4.80e-06 ***
median_income -4.859e+01 2.437e+01 -1.994 0.046171 *
enc_ocean_proximity -1.447e+02 1.206e+01 -11.996 < 2e-16 ***
total_rooms -1.379e+02 7.745e+01 -1.780 0.075044 .
latitude 1.803e+00 4.521e-01 3.989 6.67e-05 ***
longitude -1.040e+00 2.420e-01 -4.298 1.74e-05 ***
total_bedrooms 3.879e+01 8.719e+01 0.445 0.656422
housing_median_age 5.839e+01 1.192e+01 4.896 9.90e-07 ***
beds_per_rooms -1.129e+02 2.571e+01 -4.389 1.15e-05 ***
rooms_per_capita 4.789e+01 3.516e+01 1.362 0.173215
rooms_per_household -4.133e+01 3.135e+01 -1.319 0.187323
population 1.254e+02 4.758e+01 2.636 0.008412 **
income_per_capita 3.285e+01 1.551e+02 0.212 0.832297
median_income:enc_ocean_proximity -6.423e+00 2.261e+00 -2.840 0.004515 **
median_income:total_rooms 1.434e+01 7.056e+00 2.032 0.042131 *
median_income:latitude -1.654e-01 5.967e-01 -0.277 0.781604
median_income:longitude -5.561e-01 2.276e-01 -2.443 0.014567 *
median_income:total_bedrooms -1.008e+01 8.116e+00 -1.242 0.214176
median_income:housing_median_age 1.052e+00 1.755e+00 0.600 0.548818
median_income:beds_per_rooms -1.327e+00 4.698e+00 -0.282 0.777573
median_income:rooms_per_capita -1.204e+01 6.316e+00 -1.906 0.056657 .
median_income:rooms_per_household 9.921e+00 3.982e+00 2.492 0.012734 *
median_income:population -6.461e-01 5.542e+00 -0.117 0.907186
median_income:income_per_capita 4.459e+00 1.324e+00 3.368 0.000760 ***
enc_ocean_proximity:total_rooms -7.476e+00 7.653e+00 -0.977 0.328650
enc_ocean_proximity:latitude 4.188e+00 3.585e-01 11.681 < 2e-16 ***
enc_ocean_proximity:longitude -1.195e+00 9.944e-02 -12.016 < 2e-16 ***
enc_ocean_proximity:total_bedrooms -2.608e+00 8.025e+00 -0.325 0.745250
enc_ocean_proximity:housing_median_age -1.639e+01 1.426e+00 -11.497 < 2e-16 ***
enc_ocean_proximity:beds_per_rooms 3.522e+00 3.014e+00 1.169 0.242571
enc_ocean_proximity:rooms_per_capita -1.081e+01 4.084e+00 -2.648 0.008115 **
enc_ocean_proximity:rooms_per_household 1.769e+01 4.706e+00 3.758 0.000172 ***
enc_ocean_proximity:population 6.362e-01 4.934e+00 0.129 0.897412
enc_ocean_proximity:income_per_capita -1.526e+00 3.607e+00 -0.423 0.672182
total_rooms:latitude 2.122e+00 2.040e+00 1.040 0.298286
total_rooms:longitude -1.210e+00 6.830e-01 -1.772 0.076456 .
total_rooms:total_bedrooms -6.950e+00 1.945e+00 -3.574 0.000353 ***
total_rooms:housing_median_age -1.159e+01 7.707e+00 -1.504 0.132483
total_rooms:beds_per_rooms NA NA NA NA
total_rooms:rooms_per_capita -6.342e+00 1.538e+01 -0.413 0.679980
total_rooms:rooms_per_household -8.460e+00 1.444e+01 -0.586 0.558013
total_rooms:population -2.847e+00 2.969e+00 -0.959 0.337599
total_rooms:income_per_capita NA NA NA NA
latitude:longitude 1.873e-02 3.306e-03 5.667 1.49e-08 ***
latitude:total_bedrooms 6.886e-01 2.084e+00 0.330 0.741095
latitude:housing_median_age -2.527e+00 3.600e-01 -7.019 2.37e-12 ***
latitude:beds_per_rooms 2.372e+00 5.890e-01 4.028 5.67e-05 ***
latitude:rooms_per_capita -2.022e+00 1.018e+00 -1.986 0.047034 *
latitude:rooms_per_household 2.110e+00 9.314e-01 2.266 0.023487 *
latitude:population -3.335e+00 1.428e+00 -2.335 0.019575 *
latitude:income_per_capita 2.017e+00 1.742e+00 1.158 0.246933
longitude:total_bedrooms 4.055e-01 8.116e-01 0.500 0.617399
longitude:housing_median_age 3.993e-01 9.782e-02 4.081 4.51e-05 ***
longitude:beds_per_rooms -1.006e+00 2.384e-01 -4.221 2.45e-05 ***
longitude:rooms_per_capita 3.100e-01 3.017e-01 1.028 0.304120
longitude:rooms_per_household -2.634e-01 2.598e-01 -1.014 0.310512
longitude:population 1.039e+00 3.904e-01 2.662 0.007790 **
longitude:income_per_capita 4.930e-01 1.730e+00 0.285 0.775643
total_bedrooms:housing_median_age -7.772e+00 7.976e+00 -0.974 0.329906
total_bedrooms:beds_per_rooms -6.599e+00 7.408e+00 -0.891 0.373060
total_bedrooms:rooms_per_capita 1.446e+01 1.451e+01 0.996 0.319135
total_bedrooms:rooms_per_household 1.968e+00 1.308e+01 0.150 0.880378
total_bedrooms:population 9.064e+00 3.135e+00 2.891 0.003844 **
total_bedrooms:income_per_capita 6.266e+01 1.172e+02 0.535 0.592745
housing_median_age:beds_per_rooms 5.256e+00 2.270e+00 2.315 0.020625 *
housing_median_age:rooms_per_capita 1.333e+01 3.452e+00 3.862 0.000113 ***
housing_median_age:rooms_per_household -5.394e+00 3.211e+00 -1.680 0.093025 .
housing_median_age:population 1.915e+01 5.212e+00 3.675 0.000239 ***
housing_median_age:income_per_capita 1.850e+00 2.798e+00 0.661 0.508576
beds_per_rooms:rooms_per_capita -1.639e+01 5.072e+00 -3.231 0.001236 **
beds_per_rooms:rooms_per_household 7.677e+00 4.453e+00 1.724 0.084766 .
beds_per_rooms:population -9.446e+00 7.160e+00 -1.319 0.187120
beds_per_rooms:income_per_capita 7.413e+00 2.190e+00 3.385 0.000715 ***
rooms_per_capita:rooms_per_household 2.688e-02 7.693e-01 0.035 0.972126
rooms_per_capita:population NA NA NA NA
rooms_per_capita:income_per_capita 7.706e+00 2.742e+00 2.810 0.004961 **
rooms_per_household:population 1.176e-01 6.103e+00 0.019 0.984621
rooms_per_household:income_per_capita -3.660e+00 3.173e+00 -1.153 0.248768
population:income_per_capita NA NA NA NA
median_income:enc_ocean_proximity:total_rooms 1.831e-01 5.791e-02 3.161 0.001574 **
median_income:enc_ocean_proximity:latitude -3.957e-02 2.600e-02 -1.522 0.128016
median_income:enc_ocean_proximity:longitude -6.383e-02 2.624e-02 -2.432 0.015016 *
median_income:enc_ocean_proximity:total_bedrooms -4.896e-02 7.023e-02 -0.697 0.485683
median_income:enc_ocean_proximity:housing_median_age -2.709e-02 1.279e-02 -2.117 0.034282 *
median_income:enc_ocean_proximity:beds_per_rooms -1.560e-02 4.328e-02 -0.360 0.718596
median_income:enc_ocean_proximity:rooms_per_capita -1.198e-01 5.977e-02 -2.005 0.044995 *
median_income:enc_ocean_proximity:rooms_per_household 3.157e-02 3.505e-02 0.901 0.367753
median_income:enc_ocean_proximity:population -1.590e-01 5.846e-02 -2.720 0.006541 **
median_income:enc_ocean_proximity:income_per_capita 3.206e-02 1.115e-02 2.876 0.004035 **
median_income:total_rooms:latitude 2.071e-01 7.978e-02 2.596 0.009453 **
median_income:total_rooms:longitude 1.944e-01 8.126e-02 2.392 0.016754 *
median_income:total_rooms:total_bedrooms -8.605e-02 2.104e-02 -4.090 4.35e-05 ***
median_income:total_rooms:housing_median_age -2.954e-01 4.917e-02 -6.008 1.94e-09 ***
median_income:total_rooms:beds_per_rooms NA NA NA NA
median_income:total_rooms:rooms_per_capita -4.392e-02 9.322e-02 -0.471 0.637511
median_income:total_rooms:rooms_per_household 1.411e-02 8.133e-02 0.174 0.862238
median_income:total_rooms:population 5.662e-03 2.150e-02 0.263 0.792318
median_income:total_rooms:income_per_capita -1.793e+00 2.384e-01 -7.521 5.85e-14 ***
median_income:latitude:longitude 2.578e-03 4.788e-03 0.538 0.590300
median_income:latitude:total_bedrooms -1.556e-01 9.201e-02 -1.692 0.090750 .
median_income:latitude:housing_median_age -1.243e-02 1.854e-02 -0.671 0.502378
median_income:latitude:beds_per_rooms -1.582e-02 5.080e-02 -0.311 0.755548
median_income:latitude:rooms_per_capita -1.861e-01 6.839e-02 -2.721 0.006524 **
median_income:latitude:rooms_per_household 1.355e-01 4.195e-02 3.230 0.001241 **
median_income:latitude:population -5.546e-02 6.387e-02 -0.868 0.385182
median_income:latitude:income_per_capita 5.887e-02 1.597e-02 3.686 0.000229 ***
median_income:longitude:total_bedrooms -1.392e-01 9.343e-02 -1.490 0.136356
median_income:longitude:housing_median_age 5.563e-03 1.984e-02 0.280 0.779141
median_income:longitude:beds_per_rooms -1.368e-02 5.350e-02 -0.256 0.798177
median_income:longitude:rooms_per_capita -1.621e-01 7.197e-02 -2.252 0.024341 *
median_income:longitude:rooms_per_household 1.245e-01 4.495e-02 2.770 0.005615 **
median_income:longitude:population -2.628e-02 6.391e-02 -0.411 0.680864
median_income:longitude:income_per_capita 5.742e-02 1.521e-02 3.774 0.000161 ***
median_income:total_bedrooms:housing_median_age 1.743e-01 5.787e-02 3.013 0.002595 **
median_income:total_bedrooms:beds_per_rooms -9.119e-02 4.998e-02 -1.825 0.068087 .
median_income:total_bedrooms:rooms_per_capita 1.483e-01 9.419e-02 1.575 0.115297
median_income:total_bedrooms:rooms_per_household -1.375e-01 8.099e-02 -1.698 0.089617 .
median_income:total_bedrooms:population 5.588e-02 3.113e-02 1.795 0.072687 .
median_income:total_bedrooms:income_per_capita -9.635e-02 2.719e-01 -0.354 0.723122
median_income:housing_median_age:beds_per_rooms -3.128e-02 2.443e-02 -1.280 0.200470
median_income:housing_median_age:rooms_per_capita 9.968e-03 3.711e-02 0.269 0.788208
median_income:housing_median_age:rooms_per_household -7.985e-03 2.056e-02 -0.388 0.697792
median_income:housing_median_age:population 1.679e-01 4.631e-02 3.626 0.000289 ***
median_income:housing_median_age:income_per_capita 3.775e-03 9.212e-03 0.410 0.681946
median_income:beds_per_rooms:rooms_per_capita NA NA NA NA
median_income:beds_per_rooms:rooms_per_household -7.502e-02 3.808e-02 -1.970 0.048861 *
median_income:beds_per_rooms:population -6.240e-02 5.394e-02 -1.157 0.247397
median_income:beds_per_rooms:income_per_capita -1.471e-02 1.134e-02 -1.297 0.194632
median_income:rooms_per_capita:rooms_per_household -1.459e-02 3.885e-02 -0.376 0.707223
median_income:rooms_per_capita:population NA NA NA NA
median_income:rooms_per_capita:income_per_capita -2.865e-03 1.531e-02 -0.187 0.851603
median_income:rooms_per_household:population 2.244e-01 7.396e-02 3.034 0.002421 **
median_income:rooms_per_household:income_per_capita 2.631e-02 1.570e-02 1.676 0.093791 .
median_income:population:income_per_capita 1.615e+00 3.069e-01 5.264 1.44e-07 ***
enc_ocean_proximity:total_rooms:latitude -9.730e-02 8.951e-02 -1.087 0.277046
enc_ocean_proximity:total_rooms:longitude -9.195e-02 8.966e-02 -1.026 0.305129
enc_ocean_proximity:total_rooms:total_bedrooms 1.164e-02 2.264e-02 0.514 0.607095
enc_ocean_proximity:total_rooms:housing_median_age 7.723e-02 5.528e-02 1.397 0.162401
enc_ocean_proximity:total_rooms:beds_per_rooms NA NA NA NA
enc_ocean_proximity:total_rooms:rooms_per_capita -3.324e-01 1.344e-01 -2.474 0.013386 *
enc_ocean_proximity:total_rooms:rooms_per_household 1.892e-01 1.249e-01 1.514 0.130014
enc_ocean_proximity:total_rooms:population -1.688e-02 2.461e-02 -0.686 0.492731
enc_ocean_proximity:total_rooms:income_per_capita NA NA NA NA
enc_ocean_proximity:latitude:longitude 3.453e-02 2.931e-03 11.783 < 2e-16 ***
enc_ocean_proximity:latitude:total_bedrooms -2.625e-02 9.187e-02 -0.286 0.775045
enc_ocean_proximity:latitude:housing_median_age -1.571e-01 1.616e-02 -9.725 < 2e-16 ***
enc_ocean_proximity:latitude:beds_per_rooms 4.423e-02 3.392e-02 1.304 0.192282
enc_ocean_proximity:latitude:rooms_per_capita -1.268e-01 4.740e-02 -2.676 0.007462 **
enc_ocean_proximity:latitude:rooms_per_household 1.466e-01 5.437e-02 2.697 0.007011 **
enc_ocean_proximity:latitude:population 2.577e-02 5.722e-02 0.450 0.652379
enc_ocean_proximity:latitude:income_per_capita -6.554e-03 3.932e-02 -0.167 0.867625
enc_ocean_proximity:longitude:total_bedrooms -3.134e-02 9.341e-02 -0.336 0.737235
enc_ocean_proximity:longitude:housing_median_age -1.826e-01 1.649e-02 -11.075 < 2e-16 ***
enc_ocean_proximity:longitude:beds_per_rooms 4.306e-02 3.484e-02 1.236 0.216468
enc_ocean_proximity:longitude:rooms_per_capita -1.266e-01 4.762e-02 -2.658 0.007870 **
enc_ocean_proximity:longitude:rooms_per_household 1.902e-01 5.478e-02 3.472 0.000519 ***
enc_ocean_proximity:longitude:population 1.457e-02 5.727e-02 0.254 0.799162
enc_ocean_proximity:longitude:income_per_capita -2.042e-02 3.899e-02 -0.524 0.600510
enc_ocean_proximity:total_bedrooms:housing_median_age -1.844e-01 5.406e-02 -3.412 0.000647 ***
enc_ocean_proximity:total_bedrooms:beds_per_rooms -3.739e-02 5.657e-02 -0.661 0.508707
enc_ocean_proximity:total_bedrooms:rooms_per_capita 1.673e-01 1.185e-01 1.412 0.158114
enc_ocean_proximity:total_bedrooms:rooms_per_household -9.523e-02 1.092e-01 -0.872 0.383203
enc_ocean_proximity:total_bedrooms:population 1.317e-02 2.692e-02 0.489 0.624688
enc_ocean_proximity:total_bedrooms:income_per_capita 5.485e-01 1.157e+00 0.474 0.635343
enc_ocean_proximity:housing_median_age:beds_per_rooms 2.571e-02 1.747e-02 1.472 0.141181
enc_ocean_proximity:housing_median_age:rooms_per_capita 9.973e-02 2.612e-02 3.819 0.000135 ***
enc_ocean_proximity:housing_median_age:rooms_per_household -7.646e-02 2.806e-02 -2.725 0.006441 **
enc_ocean_proximity:housing_median_age:population 1.334e-01 4.170e-02 3.198 0.001387 **
enc_ocean_proximity:housing_median_age:income_per_capita 8.395e-03 1.891e-02 0.444 0.657159
enc_ocean_proximity:beds_per_rooms:rooms_per_capita -1.868e-01 4.172e-02 -4.479 7.59e-06 ***
enc_ocean_proximity:beds_per_rooms:rooms_per_household 1.241e-01 3.582e-02 3.466 0.000530 ***
enc_ocean_proximity:beds_per_rooms:population -1.556e-01 6.289e-02 -2.474 0.013374 *
enc_ocean_proximity:beds_per_rooms:income_per_capita 1.754e-02 2.353e-02 0.745 0.456022
enc_ocean_proximity:rooms_per_capita:rooms_per_household -8.721e-03 1.192e-02 -0.731 0.464547
enc_ocean_proximity:rooms_per_capita:population NA NA NA NA
enc_ocean_proximity:rooms_per_capita:income_per_capita 3.542e-02 2.547e-02 1.391 0.164305
enc_ocean_proximity:rooms_per_household:population -2.137e-01 8.805e-02 -2.427 0.015230 *
enc_ocean_proximity:rooms_per_household:income_per_capita -6.637e-03 3.216e-02 -0.206 0.836486
enc_ocean_proximity:population:income_per_capita NA NA NA NA
total_rooms:latitude:longitude 1.974e-02 1.675e-02 1.179 0.238419
total_rooms:latitude:total_bedrooms -7.009e-02 2.277e-02 -3.079 0.002084 **
total_rooms:latitude:housing_median_age -8.948e-02 8.389e-02 -1.067 0.286150
total_rooms:latitude:beds_per_rooms NA NA NA NA
total_rooms:latitude:rooms_per_capita -1.609e-01 1.540e-01 -1.045 0.296170
total_rooms:latitude:rooms_per_household -8.954e-02 1.422e-01 -0.630 0.528813
total_rooms:latitude:population -3.898e-02 3.414e-02 -1.142 0.253552
total_rooms:latitude:income_per_capita NA NA NA NA
total_rooms:longitude:total_bedrooms -7.547e-02 2.230e-02 -3.384 0.000717 ***
total_rooms:longitude:housing_median_age -1.196e-01 8.831e-02 -1.355 0.175475
total_rooms:longitude:beds_per_rooms NA NA NA NA
total_rooms:longitude:rooms_per_capita -1.013e-01 1.718e-01 -0.590 0.555384
total_rooms:longitude:rooms_per_household -1.031e-01 1.610e-01 -0.641 0.521691
total_rooms:longitude:population -3.893e-02 3.458e-02 -1.126 0.260355
total_rooms:longitude:income_per_capita NA NA NA NA
total_rooms:total_bedrooms:housing_median_age -3.397e-02 1.699e-02 -1.999 0.045624 *
total_rooms:total_bedrooms:beds_per_rooms 8.571e-02 3.871e-02 2.214 0.026829 *
total_rooms:total_bedrooms:rooms_per_capita -2.112e-02 1.362e-02 -1.551 0.120938
total_rooms:total_bedrooms:rooms_per_household 4.324e-02 1.876e-02 2.305 0.021212 *
total_rooms:total_bedrooms:population 7.581e-04 6.264e-04 1.210 0.226163
total_rooms:total_bedrooms:income_per_capita NA NA NA NA
[ reached getOption("max.print") -- omitted 99 rows ]
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.5599 on 11292 degrees of freedom
Multiple R-squared: 0.7729, Adjusted R-squared: 0.7677
F-statistic: 147.2 on 261 and 11292 DF, p-value: < 2.2e-16
# Evaluate the full three-way interaction model on the held-out validation set.
scores(
  full_threeway_model,
  xs = validate1,
  y = validate1$median_house_value
)
prediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be 
misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleading
# Robust linear regression fitted with lmrob() on the training split.
# NOTE(review): each `x + poly(x, k)` pair is collinear (the first poly column
# is a linear function of x); term order is kept as-is because it determines
# which of the aliased columns gets dropped.
fitLmrob <- lmrob(
  median_house_value ~
    median_income + poly(median_income, 2) +
    latitude * longitude +
    enc_ocean_proximity +
    population + poly(population, 2) +
    total_rooms * total_bedrooms +
    total_bedrooms + poly(total_bedrooms, 3) +
    housing_median_age + poly(housing_median_age, 3) +
    beds_per_rooms + poly(beds_per_rooms, 3) +
    rooms_per_capita +
    rooms_per_household + poly(rooms_per_household, 3) +
    income_per_capita,
  data = extractTrainingVars(train1)
)
summary(fitLmrob)

# Score on the validation split using a trimmed cost (rtmspe, trim = 0.1).
scores(
  fitLmrob,
  xs = validate1,
  y = validate1$median_house_value,
  cost = rtmspe,
  trim = 0.1
)
# Least trimmed squares regression (ltsReg) with the same formula as the
# other robust fits, trained on the training split.
fitLts <- ltsReg(
  median_house_value ~
    median_income + poly(median_income, 2) +
    latitude * longitude +
    enc_ocean_proximity +
    population + poly(population, 2) +
    total_rooms * total_bedrooms +
    total_bedrooms + poly(total_bedrooms, 3) +
    housing_median_age + poly(housing_median_age, 3) +
    beds_per_rooms + poly(beds_per_rooms, 3) +
    rooms_per_capita +
    rooms_per_household + poly(rooms_per_household, 3) +
    income_per_capita,
  data = extractTrainingVars(train1)
)
# Progress marker: the fit above finished.
print("fitted")

# NOTE(review): unlike the other scores() calls, no `xs`/`y` are passed here,
# so whatever defaults scores() defines are used -- confirm this is intended.
scores(fitLts, cost = rtmspe, trim = 0.1)
# Iteratively re-weighted least squares (rlm) with rlm's default psi function.
irls <- rlm(
  median_house_value ~
    median_income + poly(median_income, 2) +
    latitude * longitude +
    enc_ocean_proximity +
    population + poly(population, 2) +
    total_rooms * total_bedrooms +
    total_bedrooms + poly(total_bedrooms, 3) +
    housing_median_age + poly(housing_median_age, 3) +
    beds_per_rooms + poly(beds_per_rooms, 3) +
    rooms_per_capita +
    rooms_per_household + poly(rooms_per_household, 3) +
    income_per_capita,
  data = extractTrainingVars(train1)
)

# Same model, but with Tukey's bisquare psi function.
irls_bi <- rlm(
  median_house_value ~
    median_income + poly(median_income, 2) +
    latitude * longitude +
    enc_ocean_proximity +
    population + poly(population, 2) +
    total_rooms * total_bedrooms +
    total_bedrooms + poly(total_bedrooms, 3) +
    housing_median_age + poly(housing_median_age, 3) +
    beds_per_rooms + poly(beds_per_rooms, 3) +
    rooms_per_capita +
    rooms_per_household + poly(rooms_per_household, 3) +
    income_per_capita,
  data = extractTrainingVars(train1),
  psi = psi.bisquare
)

summary(irls)
summary(irls_bi)

# Compare both fits on the validation split.
scores(irls, xs = validate1, y = validate1$median_house_value)
scores(irls_bi, xs = validate1, y = validate1$median_house_value)
# Builds a compact time stamp of the current time, used in output file names.
#
# Returns a single string formatted as "%m%d_%H%M%OS" with millisecond
# precision, e.g. "0412_153045.123".
timestamp <- function() {
  # "%OS" prints fractional seconds only when `digits.secs` is set, so raise
  # it temporarily and always restore the caller's value, even on error.
  old_options <- options(digits.secs = 3)
  on.exit(options(old_options), add = TRUE)
  strftime(Sys.time(), "%m%d_%H%M%OS")
}
# Writes a two-column submission file (id, median_house_value) to
# "predictions/<timestamp>_<name>.csv".
#
# predictions: numeric predictions on the scaled target; they are multiplied
#              by `median_house_value_scale` before writing.
# name:        label appended to the time stamp in the file name.
# ids:         row identifiers, one per prediction (defaults to the ids of the
#              global `testing_set`).
#
# Stops with an error when `ids` and `predictions` differ in length, since
# cbind() would otherwise recycle or drop values silently.
writePredictions <- function(predictions, name, ids = testing_set$id) {
  if (length(ids) != length(predictions)) {
    stop(
      "ids and predictions must have the same length (",
      length(ids), " vs ", length(predictions), ")",
      call. = FALSE
    )
  }

  # NOTE(review): abs() mirrors negative predictions to positive values rather
  # than clipping them -- presumably because the target cannot be negative.
  data <- cbind(
    id = ids,
    median_house_value = abs(predictions * median_house_value_scale)
  )

  # write.csv() does not create missing directories, so make sure it exists.
  out_dir <- "predictions"
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }

  filename <- paste0(out_dir, "/", timestamp(), "_", name, ".csv")
  write.csv(data, filename, row.names = FALSE, quote = FALSE)
}
# Predicts with `model` on `test_set` and writes the result via
# writePredictions().
#
# model:    fitted model object accepted by predict().
# name:     label used in the output file name.
# test_set: data frame to predict on; must carry an `id` column
#           (defaults to the global `testing_set`).
writePredictions1 <- function(model, name, test_set = testing_set) {
  # Forward the ids of the data actually predicted on; relying on the default
  # of writePredictions() would always pair predictions with testing_set$id.
  writePredictions(predict(model, test_set), name, ids = test_set$id)
}
# Load the test data through the same preprocessing entry point as the training data.
testing_set <- loadAndPreprocess("test.csv")
coded 1 cols 5 levels
# Apply the remaining preparation steps to the test set: impute
# total_bedrooms, add the derived features, then apply the fitted
# preProcess1 transformation.
testing_set <- testing_set %>%
  imp_total_bedrooms() %>%
  addExtraFeats()
testing_set <- predict(preProcess1, testing_set)

# Submissions for the robust fits are currently disabled.
# writePredictions1(fitLmrob, "fitLmrob")
# writePredictions1(fitLts, "fitLts")
# writePredictions1(irls, "irls")
writePredictions1(full_threeway_model, "full_threeway_model")
# Group the training points into geographic clusters and plot them.
x <- train1$longitude
y <- train1$latitude

# Spatial points (lon/lat, WGS84) carrying the training ids.
xy <- SpatialPointsDataFrame(
  matrix(c(x, y), ncol = 2),
  data.frame(ID = train1$id),
  proj4string = CRS("+proj=longlat +ellps=WGS84 +datum=WGS84")
)

# Pairwise distance matrix between all points, then complete-linkage
# hierarchical clustering on it.
mdist <- distm(xy)
hc <- hclust(as.dist(mdist), method = "complete")

d <- 40000 # radius in meters

# Cut the tree at height d and store the cluster id on both objects.
xy$clust <- cutree(hc, h = d)
train1$clust <- factor(xy$clust)

# One centroid (two coordinates) per cluster.
cent <- matrix(ncol = 2, nrow = max(xy$clust))
for (i in seq_len(max(xy$clust))) {
  cent[i, ] <- gCentroid(subset(xy, clust == i))@coords
}

# Slightly enlarge the bounding box, then draw a circle of radius d around
# every centroid with the points coloured by cluster on top.
xy@bbox[] <- as.matrix(extend(extent(xy), 0.001))
ci <- circles(cent, d = d, lonlat = TRUE)
plot(ci@polygons, axes = TRUE)
plot(xy, col = rainbow(max(xy$clust))[factor(xy$clust)], add = TRUE)

# Cluster sizes.
as.data.frame(table(xy$clust))
# Assign every validation row to the cluster whose centroid is its nearest
# neighbour (nn2 with k = 1 on the raw longitude/latitude coordinates).
points <- data.frame(x = validate1$longitude, y = validate1$latitude)
validate1$clust <- factor(
  nn2(as.data.frame(cent), query = points, k = 1)$nn.idx
)

# Same assignment for the test set.
points <- data.frame(x = testing_set$longitude, y = testing_set$latitude)
testing_set$clust <- factor(
  nn2(as.data.frame(cent), query = points, k = 1)$nn.idx
)
# One-hot encodes the `clust` column of `df` and appends the indicator
# columns (named "clust.<level>") to the data frame.
#
# df:      data frame with a `clust` column holding cluster labels.
# n_clust: number of clusters the encoding must always cover; labels
#          1..n_clust get a column even when absent from `df`. Labels present
#          in `df` beyond that range are kept as additional columns.
#
# Returns `df` with the indicator columns bound on the right.
OHE_clust <- function(df, n_clust = 240) {
  col <- df %>% dplyr::select(clust)

  # dummyVars() builds its columns from the factor levels found in `data`.
  # The levels are fixed here so that every data set (train, validation,
  # test) yields the same set of clust.* columns, even if some clusters do
  # not occur in it. The former `x = factor(1:240)` argument only landed in
  # `...` and did not influence the encoding.
  all_levels <- union(
    as.character(seq_len(n_clust)),
    levels(as.factor(col$clust))
  )
  col$clust <- factor(as.character(col$clust), levels = all_levels)

  encoder <- dummyVars("~.", data = col)
  cols <- data.frame(predict(encoder, newdata = col))
  cbind(df, cols)
}
# Add the one-hot cluster indicators to each of the three data sets.
trainClust <- OHE_clust(train1)
validateClust <- OHE_clust(validate1)
testing_setClust <- OHE_clust(testing_set)
# Keeps only the modelling columns of `data`: the response, the base
# predictors and every one-hot cluster column (names starting with "clust.").
#
# data: data frame containing the columns listed below.
# Returns the reduced data frame.
extractTrainingVarsClust <- function(data) {
  dplyr::select(
    data,
    median_house_value,
    median_income,
    enc_ocean_proximity,
    total_rooms,
    latitude,
    longitude,
    total_bedrooms,
    housing_median_age,
    beds_per_rooms,
    rooms_per_capita,
    rooms_per_household,
    population,
    income_per_capita,
    starts_with("clust.")
  )
}
# Ordinary least squares with the same terms as the robust fits plus the
# cluster indicators: the trailing `.` pulls in every remaining column of the
# data, i.e. the clust.* dummies selected by extractTrainingVarsClust().
fitClust <- lm(
  median_house_value ~
    median_income + poly(median_income, 2) +
    latitude * longitude +
    enc_ocean_proximity +
    population + poly(population, 2) +
    total_rooms * total_bedrooms +
    total_bedrooms + poly(total_bedrooms, 3) +
    housing_median_age + poly(housing_median_age, 3) +
    beds_per_rooms + poly(beds_per_rooms, 3) +
    rooms_per_capita +
    rooms_per_household + poly(rooms_per_household, 3) +
    income_per_capita + .,
  data = extractTrainingVarsClust(trainClust)
)
summary(fitClust)
Call:
lm(formula = median_house_value ~ median_income + poly(median_income,
2) + latitude * longitude + enc_ocean_proximity + population +
poly(population, 2) + total_rooms * total_bedrooms + total_bedrooms +
poly(total_bedrooms, 3) + housing_median_age + poly(housing_median_age,
3) + beds_per_rooms + poly(beds_per_rooms, 3) + rooms_per_capita +
rooms_per_household + poly(rooms_per_household, 3) + income_per_capita +
., data = extractTrainingVarsClust(trainClust))
Residuals:
Min 1Q Median 3Q Max
-3.2312 -0.3147 -0.0533 0.2416 4.3074
Coefficients: (7 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.009e+03 1.168e+02 -8.635 < 2e-16 ***
median_income 5.628e-01 1.158e-02 48.599 < 2e-16 ***
poly(median_income, 2)1 NA NA NA NA
poly(median_income, 2)2 -3.712e+00 6.710e-01 -5.533 3.23e-08 ***
latitude 2.419e+01 3.269e+00 7.398 1.48e-13 ***
longitude -8.695e+00 9.777e-01 -8.894 < 2e-16 ***
enc_ocean_proximity 6.237e-02 1.566e-02 3.984 6.83e-05 ***
population -4.089e-02 1.941e-02 -2.107 0.035178 *
poly(population, 2)1 NA NA NA NA
poly(population, 2)2 3.321e+00 1.078e+00 3.080 0.002072 **
total_rooms 5.720e-02 3.954e-02 1.446 0.148078
total_bedrooms 3.076e-02 2.777e-02 1.108 0.267898
poly(total_bedrooms, 3)1 NA NA NA NA
poly(total_bedrooms, 3)2 -1.772e+00 2.720e+00 -0.651 0.514815
poly(total_bedrooms, 3)3 6.853e-02 6.591e-01 0.104 0.917191
housing_median_age 4.232e-02 7.385e-03 5.730 1.03e-08 ***
poly(housing_median_age, 3)1 NA NA NA NA
poly(housing_median_age, 3)2 1.660e+00 6.358e-01 2.610 0.009066 **
poly(housing_median_age, 3)3 2.747e+00 6.073e-01 4.523 6.17e-06 ***
beds_per_rooms 7.938e-02 1.690e-02 4.696 2.68e-06 ***
poly(beds_per_rooms, 3)1 NA NA NA NA
poly(beds_per_rooms, 3)2 3.166e+00 8.885e-01 3.564 0.000367 ***
poly(beds_per_rooms, 3)3 -6.293e+00 6.797e-01 -9.257 < 2e-16 ***
rooms_per_capita 4.735e-01 1.768e-02 26.774 < 2e-16 ***
rooms_per_household -2.678e-01 1.686e-02 -15.882 < 2e-16 ***
poly(rooms_per_household, 3)1 NA NA NA NA
poly(rooms_per_household, 3)2 -1.193e+01 1.035e+00 -11.532 < 2e-16 ***
poly(rooms_per_household, 3)3 5.879e+00 8.168e-01 7.198 6.49e-13 ***
income_per_capita -1.210e-02 5.876e-03 -2.059 0.039503 *
clust.1 -4.019e+00 8.911e-01 -4.510 6.54e-06 ***
clust.2 -2.734e+00 9.067e-01 -3.016 0.002569 **
clust.3 -2.966e+00 9.042e-01 -3.280 0.001040 **
clust.4 -1.394e+00 6.382e-01 -2.184 0.028987 *
clust.5 -1.354e+00 6.272e-01 -2.159 0.030898 *
clust.6 -1.569e+00 6.656e-01 -2.357 0.018418 *
clust.7 -3.172e+00 8.493e-01 -3.735 0.000189 ***
clust.8 -1.626e+00 6.555e-01 -2.481 0.013103 *
clust.9 -3.677e+00 8.945e-01 -4.110 3.98e-05 ***
clust.10 -3.546e+00 8.765e-01 -4.046 5.25e-05 ***
clust.11 -3.163e+00 8.985e-01 -3.520 0.000433 ***
clust.12 -2.039e+00 6.587e-01 -3.095 0.001972 **
clust.13 -8.955e-01 1.107e+00 -0.809 0.418699
clust.14 -8.430e-01 1.038e+00 -0.812 0.416790
clust.15 -1.642e+00 9.695e-01 -1.693 0.090458 .
clust.16 -2.787e+00 9.279e-01 -3.004 0.002674 **
clust.17 -1.150e+00 1.148e+00 -1.001 0.316750
clust.18 3.336e+00 6.211e-01 5.371 7.97e-08 ***
clust.19 -3.409e+00 7.702e-01 -4.426 9.68e-06 ***
clust.20 -1.088e+00 1.031e+00 -1.055 0.291231
clust.21 -1.554e+00 6.669e-01 -2.330 0.019799 *
clust.22 -1.783e+00 9.399e-01 -1.897 0.057913 .
clust.23 -3.388e+00 9.353e-01 -3.622 0.000293 ***
clust.24 -8.905e-01 5.886e-01 -1.513 0.130345
clust.25 -1.861e+00 6.484e-01 -2.870 0.004107 **
clust.26 -1.551e+00 6.436e-01 -2.410 0.015972 *
clust.27 -3.340e+00 7.138e-01 -4.679 2.92e-06 ***
clust.28 -3.154e+00 8.862e-01 -3.559 0.000374 ***
clust.29 -2.911e+00 7.720e-01 -3.770 0.000164 ***
clust.30 -3.209e+00 8.042e-01 -3.991 6.62e-05 ***
clust.31 -6.069e-01 5.933e-01 -1.023 0.306405
clust.32 -8.021e-01 6.168e-01 -1.300 0.193473
clust.33 -1.715e+00 9.382e-01 -1.828 0.067582 .
clust.34 -3.424e+00 7.371e-01 -4.645 3.44e-06 ***
clust.35 -3.327e+00 9.184e-01 -3.622 0.000294 ***
clust.36 -2.015e+00 6.756e-01 -2.982 0.002867 **
clust.37 -3.883e+00 9.009e-01 -4.311 1.64e-05 ***
clust.38 -3.851e+00 7.781e-01 -4.949 7.56e-07 ***
clust.39 -1.838e+00 9.768e-01 -1.882 0.059913 .
clust.40 -1.476e+00 6.444e-01 -2.291 0.021986 *
clust.41 -3.650e+00 8.928e-01 -4.089 4.37e-05 ***
clust.42 -3.111e+00 7.833e-01 -3.972 7.17e-05 ***
clust.43 -3.403e+00 8.094e-01 -4.204 2.64e-05 ***
clust.44 -1.299e+00 6.359e-01 -2.042 0.041140 *
clust.45 -4.589e+00 8.146e-01 -5.634 1.81e-08 ***
clust.46 -3.542e+00 9.098e-01 -3.893 9.97e-05 ***
clust.47 -3.051e+00 8.603e-01 -3.547 0.000392 ***
clust.48 -1.332e+00 6.355e-01 -2.096 0.036122 *
clust.49 -3.988e+00 8.642e-01 -4.615 3.98e-06 ***
clust.50 -2.170e+00 9.165e-01 -2.368 0.017918 *
clust.51 -1.012e+00 6.057e-01 -1.671 0.094841 .
clust.52 -6.876e-01 1.050e+00 -0.655 0.512555
clust.53 -6.675e-01 5.967e-01 -1.119 0.263317
clust.54 -1.655e+00 6.538e-01 -2.531 0.011383 *
clust.55 -3.499e+00 9.277e-01 -3.771 0.000163 ***
clust.56 1.007e-01 1.293e+00 0.078 0.937910
clust.57 -2.730e+00 9.819e-01 -2.781 0.005430 **
clust.58 -3.274e+00 9.187e-01 -3.564 0.000367 ***
clust.59 -3.540e+00 7.953e-01 -4.451 8.63e-06 ***
clust.60 -3.181e+00 9.102e-01 -3.495 0.000475 ***
clust.61 -1.230e+00 6.299e-01 -1.952 0.050911 .
clust.62 -1.473e+00 1.014e+00 -1.453 0.146288
clust.63 -2.887e+00 7.865e-01 -3.671 0.000243 ***
clust.64 -1.741e+00 1.000e+00 -1.740 0.081834 .
clust.65 -2.510e+00 9.173e-01 -2.736 0.006229 **
clust.66 -2.734e-01 6.099e-01 -0.448 0.653998
clust.67 -2.142e+00 9.525e-01 -2.248 0.024570 *
clust.68 -2.187e+00 6.835e-01 -3.199 0.001381 **
clust.69 -3.710e+00 8.703e-01 -4.263 2.04e-05 ***
clust.70 -3.190e+00 7.082e-01 -4.505 6.71e-06 ***
clust.71 -2.709e+00 9.509e-01 -2.848 0.004401 **
clust.72 4.068e-01 6.029e-01 0.675 0.499824
clust.73 -3.473e+00 8.846e-01 -3.926 8.67e-05 ***
clust.74 -1.460e+00 9.862e-01 -1.481 0.138662
clust.75 -2.591e+00 8.717e-01 -2.973 0.002956 **
clust.76 -1.281e+00 9.868e-01 -1.299 0.194094
clust.77 -2.780e+00 6.946e-01 -4.002 6.33e-05 ***
clust.78 -2.683e+00 9.605e-01 -2.793 0.005224 **
clust.79 -3.325e+00 8.805e-01 -3.776 0.000160 ***
clust.80 1.727e+00 5.904e-01 2.925 0.003454 **
clust.81 -2.996e+00 9.940e-01 -3.014 0.002588 **
clust.82 -3.777e+00 8.534e-01 -4.426 9.69e-06 ***
clust.83 -3.384e+00 8.693e-01 -3.893 9.98e-05 ***
clust.84 -2.752e+00 7.988e-01 -3.445 0.000573 ***
clust.85 -3.200e+00 8.111e-01 -3.945 8.02e-05 ***
clust.86 -7.728e-01 1.010e+00 -0.765 0.444297
clust.87 -4.642e+00 8.550e-01 -5.429 5.79e-08 ***
clust.88 -4.374e+00 8.677e-01 -5.041 4.71e-07 ***
clust.89 -3.402e+00 8.293e-01 -4.103 4.11e-05 ***
clust.90 -1.716e+00 9.342e-01 -1.837 0.066214 .
clust.91 -3.299e+00 8.634e-01 -3.821 0.000133 ***
clust.92 -9.330e-01 6.277e-01 -1.486 0.137189
clust.93 -1.975e+00 9.193e-01 -2.149 0.031679 *
clust.94 -3.025e+00 9.353e-01 -3.234 0.001224 **
clust.95 -7.324e-01 5.941e-01 -1.233 0.217683
clust.96 -1.268e+00 6.400e-01 -1.982 0.047507 *
clust.97 -3.538e+00 9.593e-01 -3.688 0.000227 ***
clust.98 -1.338e+00 1.041e+00 -1.285 0.198788
clust.99 -2.169e+00 9.838e-01 -2.204 0.027512 *
clust.100 -5.182e+00 7.810e-01 -6.635 3.39e-11 ***
clust.101 1.716e+00 5.828e-01 2.944 0.003247 **
clust.102 -2.482e+00 9.108e-01 -2.726 0.006429 **
clust.103 4.337e-01 1.241e+00 0.349 0.726794
clust.104 -1.034e+00 6.791e-01 -1.522 0.127959
clust.105 -4.482e+00 8.356e-01 -5.364 8.29e-08 ***
clust.106 -1.614e+00 8.825e-01 -1.829 0.067470 .
clust.107 -3.116e+00 8.054e-01 -3.869 0.000110 ***
clust.108 -3.158e+00 9.782e-01 -3.229 0.001248 **
clust.109 -2.112e+00 9.124e-01 -2.315 0.020622 *
clust.110 -2.466e+00 9.367e-01 -2.632 0.008488 **
clust.111 -2.878e+00 7.535e-01 -3.819 0.000135 ***
clust.112 -1.034e+00 6.273e-01 -1.648 0.099316 .
clust.113 -4.907e+00 8.143e-01 -6.026 1.74e-09 ***
clust.114 -3.533e+00 8.299e-01 -4.258 2.08e-05 ***
clust.115 -2.066e+00 7.322e-01 -2.821 0.004788 **
clust.116 -4.269e+00 8.722e-01 -4.895 9.99e-07 ***
clust.117 -2.283e-01 6.160e-01 -0.371 0.710919
clust.118 -3.955e+00 7.544e-01 -5.242 1.62e-07 ***
clust.119 -2.852e+00 9.400e-01 -3.034 0.002420 **
clust.120 -4.702e-01 1.205e+00 -0.390 0.696473
clust.121 -3.365e+00 8.461e-01 -3.977 7.03e-05 ***
clust.122 -2.704e+00 6.954e-01 -3.889 0.000101 ***
clust.123 -4.830e+00 8.503e-01 -5.681 1.37e-08 ***
clust.124 -1.459e+00 8.686e-01 -1.680 0.092968 .
clust.125 -1.861e+00 7.069e-01 -2.632 0.008498 **
clust.126 -4.450e+00 7.947e-01 -5.599 2.21e-08 ***
clust.127 -3.595e+00 8.586e-01 -4.187 2.85e-05 ***
clust.128 -3.741e+00 7.627e-01 -4.905 9.46e-07 ***
clust.129 -2.388e+00 7.261e-01 -3.288 0.001011 **
clust.130 -2.775e+00 7.548e-01 -3.676 0.000238 ***
clust.131 4.702e-01 6.231e-01 0.755 0.450436
clust.132 -3.031e+00 9.964e-01 -3.042 0.002359 **
clust.133 -1.135e+00 6.546e-01 -1.734 0.082968 .
clust.134 -5.511e+00 7.826e-01 -7.042 2.01e-12 ***
clust.135 -3.053e+00 7.362e-01 -4.147 3.39e-05 ***
clust.136 -5.060e+00 9.352e-01 -5.410 6.42e-08 ***
clust.137 -3.428e+00 9.836e-01 -3.485 0.000494 ***
clust.138 -3.300e+00 8.756e-01 -3.769 0.000164 ***
clust.139 -1.617e+00 1.061e+00 -1.524 0.127467
clust.140 -2.823e+00 9.151e-01 -3.085 0.002037 **
clust.141 -2.239e+00 6.934e-01 -3.229 0.001247 **
clust.142 1.647e-01 6.725e-01 0.245 0.806492
clust.143 -3.270e+00 8.397e-01 -3.894 9.90e-05 ***
clust.144 3.768e-01 8.332e-01 0.452 0.651101
clust.145 -3.317e+00 8.277e-01 -4.007 6.18e-05 ***
clust.146 -2.219e+00 9.847e-01 -2.253 0.024259 *
clust.147 -3.174e+00 1.146e+00 -2.770 0.005607 **
clust.148 -4.876e+00 8.288e-01 -5.883 4.15e-09 ***
clust.149 -3.050e+00 8.945e-01 -3.410 0.000652 ***
clust.150 4.130e-01 7.009e-01 0.589 0.555648
clust.151 2.994e-01 1.153e+00 0.260 0.795196
clust.152 -4.437e+00 8.571e-01 -5.177 2.29e-07 ***
clust.153 -1.012e+00 1.104e+00 -0.917 0.359106
clust.154 -3.786e+00 8.756e-01 -4.324 1.55e-05 ***
clust.155 2.016e-02 6.552e-01 0.031 0.975457
clust.156 -2.941e+00 9.463e-01 -3.108 0.001890 **
clust.157 -8.405e-01 1.173e+00 -0.716 0.473725
clust.158 -3.383e-01 1.103e+00 -0.307 0.759089
clust.159 -2.839e+00 1.045e+00 -2.716 0.006616 **
clust.160 -1.694e+00 7.884e-01 -2.149 0.031643 *
clust.161 -3.788e-01 6.499e-01 -0.583 0.560022
clust.162 -1.219e+00 1.091e+00 -1.118 0.263739
clust.163 -2.170e-01 6.214e-01 -0.349 0.726884
clust.164 -1.691e+00 6.928e-01 -2.441 0.014682 *
clust.165 -2.683e+00 7.731e-01 -3.470 0.000522 ***
clust.166 -2.868e+00 8.597e-01 -3.336 0.000854 ***
clust.167 -3.030e+00 1.074e+00 -2.821 0.004792 **
clust.168 -3.771e+00 9.484e-01 -3.976 7.06e-05 ***
clust.169 -3.037e+00 8.829e-01 -3.440 0.000584 ***
clust.170 -3.089e+00 7.807e-01 -3.957 7.63e-05 ***
clust.171 7.853e-01 1.270e+00 0.618 0.536523
[ reached getOption("max.print") -- omitted 71 rows ]
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.5702 on 11290 degrees of freedom
Multiple R-squared: 0.7645, Adjusted R-squared: 0.759
F-statistic: 139.4 on 263 and 11290 DF, p-value: < 2.2e-16
# Evaluate the cluster-augmented model on the validation split.
scores(
  fitClust,
  xs = validateClust,
  y = validateClust$median_house_value
)
prediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be 
misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleadingprediction from a rank-deficient fit may be misleading
# Write the submission for the 40 km cluster model, predicting on the test
# set that carries the clust.* columns.
writePredictions1(
  fitClust,
  "fitClust40km",
  test_set = testing_setClust
)
prediction from a rank-deficient fit may be misleading